/* 1.data_prep.do
This file extracts the original CVR dataset, merges it with the INEI geocode, selects the samples used in the analysis, and creates descriptive statistics

Original data are publicly available at https://www.dropbox.com/s/3op8vxq5yn1wzo5/Intermuestra1-2.zip
*/

* ---------------------------------------------------------------
* Project configuration.
* name   : stem of the log file written to the output folder
* main   : project root; all other folders are derived from it
* output : log files
* data   : datasets read and written by this script
* odata  : folder holding the original CVR csv file
* odata1 : folder holding the INEI geocode csv file
* code   : auxiliary do-files called by this script
* ---------------------------------------------------------------
global name "s1_data_prep"
global main "C:/Users/silvio/Documents/CVR/ryp"
global output "${main}/output"
global data "${main}/data"
global odata "${data}/Intermuestra1-2"
global odata1 "${data}/UBIgeo2002"
global code "${main}/code"

set more off
* The path is quoted so that cd keeps working if the project root
* is ever moved to a folder whose name contains spaces.
cd "${main}"
* Close any log left open by a previous run before opening a new one.
capture log close
log using "${output}/${name}.log", replace
clear

*****************************************
*Geocode file from the INEI
*****************************************
* Read the INEI geocode table and rename its three code columns to the
* key names used for the merge with the CVR data further below
* (presumably department, province and district codes - confirm
* against the csv header).
insheet using "$odata1/ubigeo2002.csv"
*insheet using ubigeo2002.txt
ren  coddpto idepa
ren  codprov iprov
ren  coddist idist
replace idepa=int(idepa)
replace iprov=int(iprov)
replace idist=int(idist)
* Composite code with two digits per level. The first multiplier must be
* 10000 (not 1000) so that the province digits do not overlap the first
* level; this matches the way the code derived from ubihechos is split
* into idepa, iprov and idist later in this file.
gen i=idepa*10000+iprov*100+idist

ren nombre lugar
* The old-style merge used later requires the using file to be sorted
* by the merge keys.
sort idepa iprov idist
save "$data/ubigeo2002", replace
clear
*****************************************

*****************************************
* Original dataset
*****************************************
insheet using "$odata/IntermuestraV1-2.csv"

* Row counter that records the original file order; the data are
* re-sorted on it after the geocode merge.
gen id=_n

* Perpetrator group built from agente:
*   1 = EST, PAR or RON
*   2 = SLU
*   3 = every other value (including blank)
gen perpe=3
replace perpe=1 if agente=="EST" | agente=="PAR" | agente=="RON"
replace perpe=2 if agente=="SLU"

label variable perpe "Perpetrador"
label def perpe 1 "EST"  2 "SLU" 3 "OTH"
label values perpe perpe

*Table 3 of BASM
* perpe is coded 1 to 3, so the cross-tabulations must run over 1, 2, 3.
* Filtering on perpe==0 never matches any record and left the third
* group without a table.
forvalues o = 0/1 {
	forvalues p = 1/3 {
		tab cvr dp if odh==`o' & perpe==`p'
	}
}

* perpe1 copies perpe and adds a fourth category for records whose
* agente is blank or one of NOD, ENF, EMR, ESL.
gen perpe1=perpe
replace perpe1=4 if inlist(agente,"","NOD","ENF","EMR","ESL")

* perpe2 is perpe1 with the fourth category set to missing.
gen perpe2=perpe1 if perpe1!=4

* Indicator for the fourth category of perpe1.
g miss=(perpe1==4)

* Age is year of the event minus year of birth; values outside 0-110
* are set to missing.
g age=anohechos-anonac
replace age=. if !inrange(age,0,110)

* situa is 0 when situacion is DES, 1 when it is MUE, missing otherwise.
g situa=cond(situacion=="MUE",1,cond(situacion=="DES",0,.))



************************************************
*We construct the 58 strata of BASM using the GeoCode of INEI
************************************************
* First we drop the last four digits of "ubihechos", which are not used.
************************************************
* Drop the last four digits of ubihechos. The division is evaluated in
* double precision and truncated BEFORE it is stored, and the result is
* kept as long. Storing the quotient as float first (the default type)
* can round a fractional part close to 1 up to the next integer, which
* would shift the record to the wrong geocode.
gen long i=int(ubihechos/10000)
* Split the remaining code into its three two-digit components.
gen idepa=int(i/10000)
gen iprov=int((i-idepa*10000)/100)
gen idist=int(i-idepa*10000-iprov*100)

* The old-style merge requires the master data to be sorted by the keys.
sort idepa iprov idist

* Attach the geocode table; geocodes that match no record are dropped,
* and records without a matching geocode are kept.
merge idepa iprov idist using "$data/ubigeo2002"
drop if _merge==2
* Restore the original file order.
sort id
drop _merge
save "$data/cvrextrapo", replace

* strata58.do creates j, which defines the 58 strata of BASM.
do "$code/strata58"

* source encodes which lists contain the record: cvr adds 1, dp adds 2,
* odh adds 4, plus a constant 1 (so 2 = cvr only, ..., 8 = all three).
gen source=1+cvr+2*dp+4*odh
* 0-1 indicators for perpetrator group 2, perpetrator group 1, and for
* records with source equal to 2.
gen sl=(perpe==2)
gen esta=(perpe==1)
gen source1=(source==2)

* Sex indicator: 1 when sexo is F, 0 otherwise.
gen sexon=(sexo=="F")
label def sexon 0 "Male" 1 "Female"
label values sexon sexon
label variable sexon "Sexo"

* Last four digits of ubihechos (the part removed when building i).
gen i2=ubihechos-10000*i

* excluded flags records that were not assigned to any stratum.
g excluded=(j==.)
* otheri is defined only within perpetrator group 3: 1 when perpe2 is 3,
* 0 when perpe2 was set to missing.
g otheri=(perpe2==3) if perpe==3

* Value labels.
label def other 0 "Missing" 1 "Identified"
label def excludedl 0 "In TRC Strata" 1 "Excluded"
label def sourcel 2 "C" 3 "D" 4 "CD"	5 "N" 6 "CN" 7	"DN" 8 "CDN"
label values otheri other
label values excluded excludedl
label values source sourcel
label var otheri "Other"

* Table 1 (by source) and Table 2 (by excluded): perpetrator group and
* the identified-other indicator against each column variable.
foreach col of varlist source excluded {
	tab perpe `col'
	tab otheri `col'
}

* Table 4: strata against perpetrator group and identified-other indicator.
foreach col of varlist perpe otheri {
	tab j `col'
}

save "$data/cvrextrapo", replace

* Number of strata. Kept as a scalar for any code run later in the same
* session; the steps below no longer depend on it.
scalar nstrata=58

* Count records in every (perpe, j, source) cell.
gen y=1
sort j perpe source
collapse (sum) y, by(perpe j source)

* Shift the source code down by one and drop cells without a stratum
* or perpetrator group.
replace source=source-1
drop if j==.
drop if perpe==.

* Make the table rectangular: every combination of j, perpe and source
* gets a row, with a count of zero where no record was observed.
fillin j perpe source
recode y (. = 0)

* Add one extra row per (j, perpe) pair with source 8 and y equal to -4
* (presumably the placeholder for the cell that is never observed -
* confirm against the code that reads datacvr).
* The added rows are flagged by expand itself. Identifying them by a
* hard-coded row position (_n greater than nstrata*8*3) only works when
* the table has exactly that many rows before expand; with seven source
* values it has nstrata*7*3 rows, the condition never matches, and the
* extra rows silently stay as duplicates of the last source cell.
bys j perpe: g e=1+(_N==_n)
expand e, generate(added)
replace source=8 if added
replace y=-4 if added
sort j perpe source
drop _fillin e added
gen n=_n


* Final layout: row number, perpetrator group, stratum (as i and j),
* source pattern k and count y.
keep n perpe j source y
ren source k
g i=j
order n perpe i k j y 

save "$data/datacvr", replace
clear
capture log close
